import pandas as pd
import numpy as np

# Read in the 2014 operational data.
individual2014 = pd.read_csv("/REDACTED/data/final/individualBISG2014_full_final.csv")
len(individual2014)  # row count before filtering (only displayed in an interactive session)

# Keep only rows that have a predicted probability.  `.copy()` makes the
# filtered frame independent of the original, so the bin columns added below
# are written to a real frame rather than to a slice (avoids pandas'
# SettingWithCopy ambiguity).
individual2014 = individual2014[individual2014.predicted_prob_black.notnull()].copy()
len(individual2014)  # row count after filtering

# Split into 20 equal-width bins based on BIFSG-predicted probability black.
# Edges are rounded to 2 decimals so they read 0.0, 0.05, ..., 1.0 instead of
# carrying floating-point noise from linspace.
bins_20 = [round(x, 2) for x in np.linspace(0, 1, 21)]
labels_20 = list(range(1, 21))

# `include_lowest=True` is required: pd.cut intervals are right-closed by
# default, so without it a probability of exactly 0.0 falls outside the first
# bin (0.0, 0.05] and is assigned NaN, silently dropping those taxpayers from
# every per-bin statistic.
individual2014['binned_20'] = pd.cut(
    individual2014['predicted_prob_black'],
    bins=bins_20,
    include_lowest=True,
)
individual2014['binned_20_labs'] = pd.cut(
    individual2014['predicted_prob_black'],
    bins=bins_20,
    labels=labels_20,
    include_lowest=True,
)


# Per-bin accumulators; the four lists stay index-aligned (one entry per bin).
fracs = []
aud_rates = []
avg_ppbs = []
bin_list = []

# Denominator for the taxpayer fraction: every row of the frame, hoisted out
# of the loop because it does not change between bins.
total_taxpayers = len(individual2014)

# Only iterate over bins that actually contain taxpayers.  Rows whose
# probability fell outside the bin edges carry a NaN label; NaN never compares
# equal to anything, so iterating over it would select an empty frame and
# divide by zero when computing the audit rate.
observed_bins = individual2014['binned_20_labs'].dropna().unique().sort_values()

# Calculate fraction of taxpayers, audit rate, and average predicted
# probability black within each bin.
for bin_label in observed_bins:
    print(bin_label)
    in_bin = individual2014[individual2014['binned_20_labs'] == bin_label]
    # float() keeps this a true division regardless of interpreter version.
    fracs.append(float(len(in_bin)) / total_taxpayers)
    # Share of rows in the bin flagged with aud_no_research_audits == 1.
    aud_rates.append(float((in_bin['aud_no_research_audits'] == 1).mean()))
    avg_ppbs.append(in_bin['predicted_prob_black'].mean())
    bin_list.append(bin_label)

# Assemble the per-bin lists into one table.  The explicit `columns=` pins the
# column order for the full-precision CSV.
sharpness_columns = ['frac', 'aud_rate', 'avg_ppb', 'bin']
df_20 = pd.DataFrame(
    {
        'frac': fracs,
        'aud_rate': aud_rates,
        'avg_ppb': avg_ppbs,
        'bin': bin_list,
    },
    columns=sharpness_columns,
)
df_20.head()  # preview (only displayed in an interactive session)
df_20.to_csv('/REDACTED/data/final/sharpness_table_20.csv', index=False)

# Presentation copy: values rounded to 4 decimals, bin column moved to the front.
df_20_rd = df_20.round(4)[['bin', 'avg_ppb', 'frac', 'aud_rate']]

df_20_rd.to_csv('/REDACTED/sharpness_table_20_rd.csv', index=False)

# Output the rounded table as a one-page PDF.
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# The axes exist only to host the table, so hide the frame, ticks and labels.
fig, ax = plt.subplots(figsize=(12, 4))
ax.axis('tight')
ax.axis('off')

# NOTE(review): `.values` on a frame mixing an integer `bin` column with float
# columns most likely upcasts everything to float, so bin labels may render as
# 1.0, 2.0, ... in the PDF -- confirm whether that is intended.
table = ax.table(cellText=df_20_rd.values, colLabels=df_20_rd.columns, loc='center')

# Use a fixed font size and let each column size itself to its content.
table.auto_set_font_size(False)
table.set_fontsize(8)
table.auto_set_column_width(col=list(range(len(df_20_rd.columns))))

# The context manager finalises and closes the PDF even if savefig raises,
# instead of leaving a half-written file and an open handle behind.
with PdfPages('/REDACTED/df_20_rd.pdf') as pp:
    pp.savefig(fig, bbox_inches='tight')